## Calculate similarity measures as in DVG (2017)
## JHL 

## Similarity measures (chose correlation only for convenience, given measures are robust as shown in DVG)

## For each pair of stores i and j in chain c, average across products and quarters 
## Weekly correlation in deviation from quarterly mean

### PRELIMINARIES 
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign", "psych")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages,repos = "http://cran.us.r-project.org")

require(folderfun) 
require(data.table)
require(bit64)
require(lubridate)
require(foreign)
require(psych)

# --array=1040,1290,1303,1362,1463,1484,1493,3603,7080,7260,7734,8404
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))

# Set paths 
setff("path_home")
path_home <- ffpath_home()

repository <- sprintf("%s/dta/nielsen/pranks",path_home)
do_path <- sprintf("%s/do",path_home)
save_path <- sprintf("%s/raw/nielsen",path_home)

# Create RData data table of file names and modules
if (!file.exists(sprintf("%s/module_0614_files.RData",do_path))) {
	
	 # Much more efficient listing method using system call
		# Can also sort by size with -S, then run arrays based on size 
	 ptm <- proc.time()
	 setwd(repository)
	 # List files in repository by descending order of size
	 file.table <- system("ls *.RData -S", intern = TRUE)
	 file.table <- lapply(as.list(file.table), function(x) paste(repository,"/",x,sep=""))
	 file.table <- data.table(unlist(file.table))
	 file.table[,id:=.I]

	 names(file.table) <- c("file_name", "id") # Name data table 
	 
	file.table[ ,file:=tstrsplit(file_name,"/",fixed=T)[[length(tstrsplit(file_name,"/",fixed=T))]] ]
	 # Sample: "/project2/databases/nielsen/nielsen_extracts/RMS/Movement_Files_Combined_RFormatted/1040.RData"
	 # Extract only digits: the module
	 file.table[,module:=as.numeric(gsub("[^0-9]","",file))]

	 save(file.table, file = sprintf("%s/module_0614_files.RData",do_path)) # Save to RData for quick loading 		 
	 proc.time() - ptm

}

### 1. For each top 1 product in each selected module, compute similarity measures ###

load(sprintf("%s/module_0614_files.RData",do_path)) 

file.table <- file.table[module==myID]

# For each module and channel code, extract RMS data for top UPC
for (m in file.table[,module]){
	for(c in c("D","F","M")){

		# Read in top1 RMS data 	
		rms <- data.table(read.dta(sprintf('%s/%s_top1_%s.dta',repository,m,c)))
		# Clean dates back to R formats, compute deviation 
		rms[,':='(unit_price=price/prmult,week_end=as_date(week_end,origin="1960-01-01"))]
		rms[,':='(quarter=quarter(week_end))]
		rms[,':='(m_supc_yq_up=mean(unit_price)),by=.(store_code_uc,upc,upc_ver_uc,year,quarter)]
		rms[,':='(d_supc_yq_up=unit_price-m_supc_yq_up)]
		
		# rms <- readRDS(sprintf('%s/%s_top1_%s.rds',save_path,m,c))
		
		# Reshape data to matrix for correlations 
		dev <- rms[,.(store_code_uc,week_end,d_supc_yq_up)]
		dev <- dcast(dev, store_code_uc ~ week_end, value.var = "d_supc_yq_up") 
		matrix <- as.matrix(dev[,!"store_code_uc",with=F])
		rownames(matrix)<-dev[,store_code_uc]	
		
		# Compute correlations, taking care of missing values in certain weeks 
		cor_matrix <- cor(t(matrix),use="pairwise.complete.obs")
		colnames(cor_matrix) <- dev[,store_code_uc] 
		# Possible to retain the number of observations used to calculate correlation for each pair to ensure sufficient sample size?
		count_matrix <- count.pairwise(t(matrix))
		
		# Name and reshape correlation matrix 
		cor_data <- data.table(cor_matrix)
		cor_data[,store_code_uc_2:=dev[,store_code_uc]]
			
		# Melt correlations to long form
		long_cor <- melt(cor_data, measure = list(colnames(cor_data)[!(colnames(cor_data) %in% "store_code_uc_2")]))

		# Note value vs. value1 in different versions of melt?
		long_cor[,':=' (store_code_uc_1=as.numeric(levels(variable))[variable],correlation=value1,variable=NULL,value1=NULL)]
					
		# Name and reshape count matrix 
		count_data <- data.table(count_matrix)
		count_data[,store_code_uc_2:=dev[,store_code_uc]]
			
		# Melt counts to long form
		long_count <- melt(count_data, measure = list(colnames(count_data)[!(colnames(count_data) %in% "store_code_uc_2")]))
		
		long_count[,':=' (store_code_uc_1=as.numeric(levels(variable))[variable],count=value1,variable=NULL,value1=NULL)]
		
		# Drop symmetric and diagonal elements
		long_cor <- long_cor[store_code_uc_1>store_code_uc_2,]
		long_count <- long_count[store_code_uc_1>store_code_uc_2,]
		
		# Merge correlations and counts
		setkey(long_cor,store_code_uc_1,store_code_uc_2)
		setkey(long_count,store_code_uc_1,store_code_uc_2)
		
		long <- long_cor[long_count, nomatch=0] 
		setcolorder(long, c("store_code_uc_1","store_code_uc_2","correlation","count"))
		
		saveRDS(long,file=sprintf('%s/%s_top1_%s_cor.rds',save_path,m,c))

		# Save to dta and add key variables 	
		# long <- readRDS(file=sprintf('%s/%s_top1_%s_cor.rds',save_path,m,c))

		# Merge chain characteristics 
		stores <- data.table(read.dta(sprintf("%s/stores_0615.dta",dta_path)))
		stores <- stores[,.(store_code_uc,year,parent_code,retailer_code,channel_code,store_zip3,fips_state_code,fips_county_code,dma_code)]

		col_names_1 <- paste(colnames(stores),"_1",sep="")
		col_names_2 <- paste(colnames(stores),"_2",sep="")
		stores_1 <- data.table(read.dta(sprintf("%s/stores_0615.dta",dta_path)))
		stores_2 <- data.table(read.dta(sprintf("%s/stores_0615.dta",dta_path)))
		stores_1 <- stores_1[,.(store_code_uc,year,parent_code,retailer_code,channel_code,store_zip3,fips_state_code,fips_county_code,dma_code)]
		stores_2 <- stores_2[,.(store_code_uc,year,parent_code,retailer_code,channel_code,store_zip3,fips_state_code,fips_county_code,dma_code)]
		
		colnames(stores_1) <- col_names_1
		colnames(stores_2) <- col_names_2 			
		long[,':='(year_1=2007,year_2=2007)]
		
		setkey(long,store_code_uc_1,year_1)
		setkey(stores_1,store_code_uc_1,year_1)
		long <- long[stores_1, nomatch = 0] 

		setkey(long,store_code_uc_2,year_2)
		setkey(stores_2,store_code_uc_2,year_2)
		long <- long[stores_2, nomatch=0]
		
		# Same parent indicator
		long[,same_parent:=ifelse(parent_code_1==parent_code_2,1,0)]
		
		setkey(long,store_code_uc_1,store_code_uc_2)
		
		# write.csv()
		write.dta(long, file = sprintf('%s/%s_top1_%s_cor.dta',save_path,m,c))

		print(sprintf('%s %s Done',m,c))
		
		gc()
	}	
}